library(tidyverse)
library(plotly)
library(sf)
library(mapview)
library(tigris)
library(censusapi)
library(leaflet)
library(usmap)
library(lmtest)
library(pracma)
library(forecast)
library(vars)
library(rvest)
library(RSelenium)
library(seleniumPipes)
library(dLagM)
library(jsonlite)
library(rgdal)
library(readr)
library(arsenal)
options(
tigris_class = "sf",
tigris_use_cache = TRUE
)
Sys.setenv(CENSUS_KEY="10dcd73d7c043e91bac9fb8d3989cbff54b08790")
# SET your path here to the covid19analysis folder.
sg_path <- '/Users/simonespeizer/pCloud Drive/Shared/SFBI/Restricted Data Library/Safegraph/covid19analysis/'
sg_hourly_path <- '/Users/simonespeizer/pCloud Drive/Shared/SFBI/Restricted Data Library/Safegraph/covid19analysis/weekly-patterns/v2/hourly/'
# Load the processing functions. This also loads the normalization functions.
source("safegraph_process_patterns_functions.R")
This Rmd is a continuation of v8, in which I explored the predictive ability of visits over time, but examines visits to businesses disaggregated by NAICS code.
Load case data.
# load block groups and their correspondence to ZCTAs
ac_bg_zctas <- readRDS("Simone_Speizer/ac_bg_ztcas.rds")
ac_bgs <- ac_bg_zctas %>% pull(blockgroup)
# load the case data
# note this is downloaded manually
ac_place_cases <- read.csv("Simone_Speizer/Alameda_County_Cumulative_Cases_By_Place_And_Zip.csv")
# handle the ones that are reported as <10 by calling them 5 cases. Note this is a simplification/choice I made and could be changed.
ac_cases_zip <- ac_place_cases %>%
rename(date = DtCreate) %>%
mutate(date = date %>% substr(1,10) %>% as.Date()) %>%
dplyr::select(c(date, contains("F9"))) %>% # only select zip code data
gather(key = "zipcode", value = "cases", -date) %>%
mutate(cases = ifelse(cases == "<10", "5", cases),
zipcode = zipcode %>% substr(2,6)) %>% # replace cases <10 with 10 and remove the "F" from zipcode names
mutate(cases = as.numeric(cases)) %>%
arrange(date)
# get Alameda County populations by zip code
# census data
acs_vars = readRDS("Simone_Speizer/censusData2018_acs_acs5.rds")
# define a function for pulling census data
pullCensus <- function(variableToPull, county) {
regionString <- paste0("state:06+county:", county)
censusData <- getCensus(
name = "acs/acs5",
vintage = 2018,
region = "block group:*",
regionin = regionString,
vars = variableToPull
) %>%
mutate(blockgroup = paste0(state,county,tract,block_group)) %>%
select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME"))
return(censusData)
}
# get population data
ac_fips <- fips("CA", "Alameda") %>% substr(3,5)
ac_pop_zip <- pullCensus("B01003_001E", ac_fips) %>%
left_join(ac_bg_zctas %>% dplyr::select(blockgroup, zipcode)) %>%
group_by(zipcode) %>%
summarize(total_pop_zip = sum(B01003_001E))
# join with cases
ac_cases_zip <- ac_cases_zip %>% left_join(ac_pop_zip) %>%
mutate(cases_by_pop = cases / total_pop_zip)
Load and process weekly visits data, grouping by NAICS code. Note that this is limited to visits to POIs in Bay Area counties.
# poi_ca <- readRDS(paste0(sg_path,'ca_poi.rds'))
#
# # names of Bay Area counties
# bay_county_names <-
# c(
# "Alameda",
# "Contra Costa",
# "Marin",
# "Napa",
# "San Francisco",
# "San Mateo",
# "Santa Clara",
# "Solano",
# "Sonoma"
# )
#
# # function to get weekly visits by zip code, by 2 digit NAICS code, for a particular county, as well as weekly visits to any particular NAICS codes of interest. inputs are the name of the patterns file for the week of interest, the name of the home panel summary file for the week of interest, a data frame of the block groups in the county and the zip codes that they correspond to (with column names blockgroup and zipcode), and a vector of the specific NAICS codes of interest.
# # note that this limits just to POIs in the Bay Area.
# processPatternsWeeklyNAICS <- function(patterns_file, hps_file, county_bg_zctas, specific_NAICS, date_start) {
#
# # load weekly patterns and the hps file. had a problem reading one file as a RDS so handle that below.
# if (date_start == "2020-03-23") {
# patterns <- read.csv(paste0(sg_path,'weekly-patterns/v2/main-file/', substr(patterns_file, 1, nchar(patterns_file) - 4), ".csv"))
# } else {
# patterns <- readRDS(paste0(sg_path,'weekly-patterns/v2/main-file/', patterns_file))
# }
#
# patterns <- patterns %>% left_join(
# poi_ca %>%
# dplyr::select(
# safegraph_place_id,
# longitude,
# latitude,
# naics_code
# ),
# by = "safegraph_place_id"
# ) %>%
# filter(!is.na(longitude)) %>%
# mutate(
# long = longitude,
# lat = latitude
# ) %>%
# st_as_sf(coords = c("long","lat")) %>%
# st_set_crs(4326) %>%
# st_join(
# counties("CA", cb=F, progress_bar=F) %>%
# filter(NAME %in% bay_county_names) %>%
# st_transform(4326) %>%
# dplyr::select(NAME),
# left=F
# ) %>%
# mutate(
# naics_4dig = as.character(naics_code %/% 100),
# naics_2dig = as.character(naics_code %/% 10000),
# ) %>% as.data.frame()
#
# hps <- readRDS(paste0(sg_path,'weekly-patterns/v2/home-summary-file/', hps_file))
#
# county_patterns_weekly <- normBG(patterns, hps) %>% dplyr::select(origin_census_block_group, safegraph_place_id, visit_counts_high, visit_counts_low, naics_code, naics_4dig, naics_2dig) %>% filter(origin_census_block_group %in% county_bg_zctas$blockgroup) %>%
# left_join(county_bg_zctas %>% as.data.frame() %>% dplyr::select(blockgroup, zipcode), by = c("origin_census_block_group" = "blockgroup"))
#
# county_patterns_by_NAICS_2dig <- county_patterns_weekly %>%
# group_by(naics_2dig, zipcode) %>%
# summarize(total_visits_high = sum(visit_counts_high),
# total_visits_low = sum(visit_counts_low)) %>%
# arrange(zipcode)
#
# county_patterns_specific_NAICS <- normBG(patterns %>% filter(naics_code %in% specific_NAICS), hps) %>% dplyr::select(origin_census_block_group, safegraph_place_id, visit_counts_high, visit_counts_low, naics_code, naics_4dig, naics_2dig) %>% filter(origin_census_block_group %in% county_bg_zctas$blockgroup) %>%
# left_join(county_bg_zctas %>% as.data.frame() %>% dplyr::select(blockgroup, zipcode), by = c("origin_census_block_group" = "blockgroup")) %>% group_by(naics_code, zipcode) %>%
# summarize(total_visits_high = sum(visit_counts_high),
# total_visits_low = sum(visit_counts_low)) %>%
# arrange(zipcode)
#
# return(list(county_patterns_by_NAICS_2dig, county_patterns_specific_NAICS))
# }
#
# # get files to process
# files_in_dir <- list.files(paste0(sg_path, "weekly-patterns/v2/main-file/"), pattern = "rds")
# # want the ones from 3/9 onwards
# files_to_process = files_in_dir[63:78]
#
# # specific NAICS codes of interest - bars (722410)
# specific_NAICS <- "722410"
#
# # data frames to store the results
# specific_NAICS_visits <- data.frame(naics_code = numeric(), zipcode = character(), total_visits_high = numeric(), total_visits_low = numeric(), week_start = character(), stringsAsFactors = FALSE)
# NAICS_2dig_visits <- data.frame(naics_2dig = numeric(), zipcode = character(), total_visits_high = numeric(), total_visits_low = numeric(), week_start = character(), stringsAsFactors = FALSE)
#
# # process the weekly patterns files
# for (i in 1:length(files_to_process)) {
# date_start <- substr(files_to_process[i], 1, 10)
# patterns_file <- files_to_process[i]
# hps_file <- paste0(date_start, "-home-panel-summary.rds")
#
# results <- processPatternsWeeklyNAICS(patterns_file, hps_file, ac_bg_zctas, specific_NAICS, date_start)
# curr_NAICS_2dig_visits <- results[[1]] %>% mutate(week_start = date_start) %>% ungroup()
# curr_specific_NAICS_visits <- results[[2]] %>% mutate(week_start = date_start) %>% ungroup()
#
# # join with existing results
# NAICS_2dig_visits <- rbind(NAICS_2dig_visits, curr_NAICS_2dig_visits)
# specific_NAICS_visits <- rbind(specific_NAICS_visits, curr_specific_NAICS_visits)
# }
#
# specific_NAICS_visits <- specific_NAICS_visits %>% filter(!is.na(zipcode)) %>%
# mutate(total_visits = (total_visits_high + total_visits_low)/2)
# NAICS_2dig_visits <- NAICS_2dig_visits %>% filter(!is.na(zipcode)) %>%
# mutate(total_visits = (total_visits_high + total_visits_low)/2)
#
# # save results
# saveRDS(NAICS_2dig_visits, paste0(sg_path, "weekly-patterns/v2/ac_zip_visits_by_NAICS_03-09-20_06-28-20.rds"))
# saveRDS(specific_NAICS_visits, paste0(sg_path, "weekly-patterns/v2/ac_zip_visits_bars_03-09-20_06-28-20.rds"))
# read in results
ac_NAICS_2dig_visits <- readRDS(paste0(sg_path, "weekly-patterns/v2/ac_zip_visits_by_NAICS_03-09-20_06-28-20.rds"))
ac_bar_visits <- readRDS(paste0(sg_path, "weekly-patterns/v2/ac_zip_visits_bars_03-09-20_06-28-20.rds"))
# compare to total weekly visits
ac_visits_zip_1 <- readRDS(paste0(sg_path, "weekly-patterns/v2/ac_zip_visits_daily_sum_hourly_weighted_v1_03-09-20_05-24-20.rds"))
ac_visits_zip_2 <- readRDS(paste0(sg_path, "weekly-patterns/v2/ac_zip_visits_daily_sum_hourly_weighted_v1_05-25-20_06-21-20.rds"))
ac_visits_zip <- rbind(ac_visits_zip_1, ac_visits_zip_2)
ac_visits_zip <- ac_visits_zip %>% left_join(ac_pop_zip) %>%
mutate(weighted_visits_per_capita = total_weighted_visits / total_pop_zip,
weighted_visit_hours_per_capita = total_weighted_visit_hours / total_pop_zip,
weighted_visit_hours_per_capita_v2 = total_weighted_visit_hours_v2 / total_pop_zip,
visits_per_capita = total_visits / total_pop_zip,
visit_hours_per_capita = total_visit_hours / total_pop_zip,
visits_per_area_per_capita = total_visits_per_area / total_pop_zip)
ac_visits_week <- ac_visits_zip %>%
mutate(week_num = floor((date - as.Date("2020-03-09")) / 7 + 1)) %>%
group_by(zipcode, week_num) %>%
summarize(visits_per_capita_week = sum(visits_per_capita),
weighted_visits_per_capita_week = sum(weighted_visits_per_capita),
visits_week = sum(total_visits))
# calculate the same metric but from the NAICS code disaggregated values
ac_visits_week_NAICS <- ac_NAICS_2dig_visits %>%
group_by(week_start, zipcode) %>%
summarize(visits_week = sum(total_visits)) %>%
mutate(week_num = (as.Date(week_start) - as.Date("2020-03-09"))/7 + 1) %>% arrange(zipcode) %>%
ungroup() %>%
dplyr::select(week_num, zipcode, visits_week)
# summary(comparedf(ac_visits_week_NAICS %>% filter(week_num <= 15), ac_visits_week %>% dplyr::select(week_num, zipcode, visits_week))) # other visits data only has through week 15
# they are the same, good!
# get the visits per capita
ac_NAICS_2dig_visits <- ac_NAICS_2dig_visits %>% left_join(ac_pop_zip) %>%
mutate(visits_per_capita = total_visits / total_pop_zip)
ac_bar_visits <- ac_bar_visits %>% left_join(ac_pop_zip) %>%
mutate(visits_per_capita = total_visits / total_pop_zip)
Now look at the correlation between the visits to each NAICS code and the change in cases. Like in the previous version of this analysis (v8), I look at the change in cases with all data, with only data points that start above zero, change in log of cases, change in cases with only data points that start above 10, and change in log of cases with only data points that start above 10. However, for simplicity, I will only present change in cases with all data and change in cases with only data points that start above 10 in plots here.
Note that I had to filter to non-NA NAICS codes, which means that we do lose some of the data, but it is not a ton.
visits_lag <- 2 # lag on visits, in weeks
# get weekly change in cases
ac_cases_zip_week <- ac_cases_zip %>%
filter(date >= as.Date("2020-03-23")) %>%
filter(as.numeric((date - as.Date("2020-03-23"))) %% 7 == 0) %>%
mutate(log_cases_by_pop = log(cases_by_pop)) %>%
group_by(zipcode) %>%
mutate(change_cases_by_pop = c(NA, diff(cases_by_pop)),
change_log_cases_by_pop = c(NA, diff(log_cases_by_pop)),
change_cases = c(NA, diff(cases))) %>%
mutate(corresponding_visits_week_start = as.character((date - (visits_lag + 1)*7))) %>% # note the use of visits_lag + 1 here, because here the date is the END date of the cases change week
mutate(prev_cases = cases - change_cases) # this gives the level of cases at the START of the week that ENDS on the date listed here, for use later
# join the change in cases with the visits data by NAICS code. also filter to non-NA NAICS codes
ac_visits_NAICS_cases <- ac_NAICS_2dig_visits %>% left_join(ac_cases_zip_week %>% dplyr::select(zipcode, prev_cases, change_cases_by_pop, change_log_cases_by_pop, corresponding_visits_week_start), by = c("zipcode" = "zipcode", "week_start" = "corresponding_visits_week_start")) %>%
rename(cases = prev_cases) %>%
filter(!is.na(cases) & !is.na(naics_2dig))
ac_visits_bars_cases <- ac_bar_visits %>% left_join(ac_cases_zip_week %>% dplyr::select(zipcode, prev_cases, change_cases_by_pop, change_log_cases_by_pop, corresponding_visits_week_start), by = c("zipcode" = "zipcode", "week_start" = "corresponding_visits_week_start")) %>%
rename(cases = prev_cases) %>%
filter(!is.na(cases))
# run the weekly analysis for each NAICS code
NAICS_codes <- unique(ac_visits_NAICS_cases$naics_2dig)
week_start_vals <- unique(ac_visits_NAICS_cases$week_start)
# data frame for storing results
model_result_vals <- data.frame(model_type = character(), coef_val = numeric(), p_val = numeric(), r_squared = numeric(), num_points = numeric(), week_start = character(), naics_2dig = numeric(), stringsAsFactors = FALSE)
for (i in 1:length(NAICS_codes)) {
curr_NAICS <- NAICS_codes[i]
for (j in 1:length(week_start_vals)) {
curr_week_start <- week_start_vals[j]
curr_ac_visits_cases <- ac_visits_NAICS_cases %>% filter(naics_2dig == curr_NAICS) %>% filter(week_start == curr_week_start)
curr_ac_visits_cases_above0 <- curr_ac_visits_cases %>% filter(cases > 0)
curr_ac_visits_cases_above10 <- curr_ac_visits_cases %>% filter(cases >= 10)
# store results
if (nrow(curr_ac_visits_cases) > 1) {
visits_cases_change_model <- lm(change_cases_by_pop ~ visits_per_capita, curr_ac_visits_cases)
change_cases <- data.frame("change in cases", summary(visits_cases_change_model)$coefficients[2,1], summary(visits_cases_change_model)$coefficients[2,4], summary(visits_cases_change_model)$r.squared, nrow(curr_ac_visits_cases))
colnames(change_cases) <- c("model_type", "coef_val", "p_val", "r_squared", "num_points")
change_cases <- change_cases %>% mutate(week_start = curr_week_start, naics_2dig = curr_NAICS)
model_result_vals <- rbind(model_result_vals, change_cases)
}
if (nrow(curr_ac_visits_cases_above0) > 1) {
visits_cases_change_model_above0 <- lm(change_cases_by_pop ~ visits_per_capita, curr_ac_visits_cases_above0)
visits_cases_change_model_log <- lm(change_log_cases_by_pop ~ visits_per_capita, curr_ac_visits_cases_above0)
change_cases_above0 <- data.frame("change in cases, starting above 0", summary(visits_cases_change_model_above0)$coefficients[2,1], summary(visits_cases_change_model_above0)$coefficients[2,4], summary(visits_cases_change_model_above0)$r.squared, nrow(curr_ac_visits_cases_above0))
colnames(change_cases_above0) <- c("model_type", "coef_val", "p_val", "r_squared", "num_points")
change_cases_above0 <- change_cases_above0 %>% mutate(week_start = curr_week_start, naics_2dig = curr_NAICS)
model_result_vals <- rbind(model_result_vals, change_cases_above0)
change_cases_log <- data.frame("change in log of cases, starting above 0", summary(visits_cases_change_model_log)$coefficients[2,1], summary(visits_cases_change_model_log)$coefficients[2,4], summary(visits_cases_change_model_log)$r.squared, nrow(curr_ac_visits_cases_above0))
colnames(change_cases_log) <- c("model_type", "coef_val", "p_val", "r_squared", "num_points")
change_cases_log <- change_cases_log %>% mutate(week_start = curr_week_start, naics_2dig = curr_NAICS)
model_result_vals <- rbind(model_result_vals, change_cases_log)
}
if (nrow(curr_ac_visits_cases_above10) > 1) {
visits_cases_change_model_above10 <- lm(change_cases_by_pop ~ visits_per_capita, curr_ac_visits_cases_above10)
visits_cases_change_model_above10_log <- lm(change_log_cases_by_pop ~ visits_per_capita, curr_ac_visits_cases_above10)
change_cases_above10 <- data.frame("change in cases, starting above 10", summary(visits_cases_change_model_above10)$coefficients[2,1], summary(visits_cases_change_model_above10)$coefficients[2,4], summary(visits_cases_change_model_above10)$r.squared, nrow(curr_ac_visits_cases_above10))
colnames(change_cases_above10) <- c("model_type", "coef_val", "p_val", "r_squared", "num_points")
change_cases_above10 <- change_cases_above10 %>% mutate(week_start = curr_week_start, naics_2dig = curr_NAICS)
model_result_vals <- rbind(model_result_vals, change_cases_above10)
change_cases_log_10 <- data.frame("change in log of cases, starting above 10", summary(visits_cases_change_model_above10_log)$coefficients[2,1], summary(visits_cases_change_model_above10_log)$coefficients[2,4], summary(visits_cases_change_model_above10_log)$r.squared, nrow(curr_ac_visits_cases_above10))
colnames(change_cases_log_10) <- c("model_type", "coef_val", "p_val", "r_squared", "num_points")
change_cases_log_10 <- change_cases_log_10 %>% mutate(week_start = curr_week_start, naics_2dig = curr_NAICS)
model_result_vals <- rbind(model_result_vals, change_cases_log_10)
}
}
}
# plot the r squared over time for each NAICS code, for the change in cases metric
model_result_vals %>% filter(model_type == "change in cases") %>%
plot_ly() %>%
add_trace(x = ~week_start, y = ~r_squared, color = ~naics_2dig, type = 'scatter', mode = 'markers', text = ~num_points) %>%
add_trace(x = ~week_start, y = ~r_squared, color = ~naics_2dig, type = 'scatter', mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = "Start date of visits"), yaxis = list(title = "R squared for that model"), title = "R squared over time, 1 week time interval, 14 day lag, Alameda")
model_result_vals %>% filter(model_type == "change in cases, starting above 10") %>%
plot_ly() %>%
add_trace(x = ~week_start, y = ~r_squared, color = ~naics_2dig, type = 'scatter', mode = 'markers', text = ~num_points) %>%
add_trace(x = ~week_start, y = ~r_squared, color = ~naics_2dig, type = 'scatter', mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = "Start date of visits"), yaxis = list(title = "R squared for that model"), title = "R squared over time, 1 week time interval, 14 day lag, Alameda")
In these plots, the number displayed in the text for each point is the number of data points that went into the model for that point.
The really high r-squared in the middle is for NAICS code 11, which is agriculture–those only had 2 data points on those dates, so it makes sense to exclude them.
model_result_vals %>% filter(model_type == "change in cases" & naics_2dig != "11") %>%
plot_ly() %>%
add_trace(x = ~week_start, y = ~r_squared, color = ~naics_2dig, type = 'scatter', mode = 'markers', text = ~num_points) %>%
add_trace(x = ~week_start, y = ~r_squared, color = ~naics_2dig, type = 'scatter', mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = "Start date of visits"), yaxis = list(title = "R squared for that model"), title = "R squared over time, 1 week time interval, 14 day lag, Alameda")
model_result_vals %>% filter(model_type == "change in cases, starting above 10" & naics_2dig != "11") %>%
plot_ly() %>%
add_trace(x = ~week_start, y = ~r_squared, color = ~naics_2dig, type = 'scatter', mode = 'markers', text = ~num_points) %>%
add_trace(x = ~week_start, y = ~r_squared, color = ~naics_2dig, type = 'scatter', mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = "Start date of visits"), yaxis = list(title = "R squared for that model"), title = "R squared over time, 1 week time interval, 14 day lag, Alameda")
From this, we see that in particular the NAICS codes 81, 72, and 44 do very well in the middle time frame. These correspond to “other services,” “accommodation and food services,” and one of the “retail trade” codes. This is pretty much what we would expect.
Also look at the coefficients and p values.
model_result_vals %>% filter(model_type == "change in cases" & naics_2dig != "11") %>%
plot_ly() %>%
add_trace(x = ~week_start, y = ~coef_val, color = ~naics_2dig, type = 'scatter', mode = 'markers', text = ~num_points) %>%
add_trace(x = ~week_start, y = ~coef_val, color = ~naics_2dig, type = 'scatter', mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = "Start date of visits"), yaxis = list(title = "Coefficient on visits for that model"), title = "Coef on visits over time, 1 week time interval, 14 day lag, Alameda")
model_result_vals %>% filter(model_type == "change in cases, starting above 10" & naics_2dig != "11") %>%
plot_ly() %>%
add_trace(x = ~week_start, y = ~coef_val, color = ~naics_2dig, type = 'scatter', mode = 'markers', text = ~num_points) %>%
add_trace(x = ~week_start, y = ~coef_val, color = ~naics_2dig, type = 'scatter', mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = "Start date of visits"), yaxis = list(title = "Coefficient on visits for that model"), title = "Coef on visits over time, 1 week time interval, 14 day lag, Alameda")
model_result_vals %>% filter(model_type == "change in cases" & naics_2dig != "11") %>%
plot_ly() %>%
add_trace(x = ~week_start, y = ~p_val, color = ~naics_2dig, type = 'scatter', mode = 'markers', text = ~num_points) %>%
add_trace(x = ~week_start, y = ~p_val, color = ~naics_2dig, type = 'scatter', mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = "Start date of visits"), yaxis = list(title = "P value of coefficient for that model"), title = "P val over time, 1 week time interval, 14 day lag, Alameda")
model_result_vals %>% filter(model_type == "change in cases, starting above 10" & naics_2dig != "11") %>%
plot_ly() %>%
add_trace(x = ~week_start, y = ~p_val, color = ~naics_2dig, type = 'scatter', mode = 'markers', text = ~num_points) %>%
add_trace(x = ~week_start, y = ~p_val, color = ~naics_2dig, type = 'scatter', mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = "Start date of visits"), yaxis = list(title = "P value of coefficient for that model"), title = "P val over time, 1 week time interval, 14 day lag, Alameda")
Again, 72, 44, 81 seem relevant in the middle time period.
Look at bars in particular.
# run the weekly analysis for each NAICS code
week_start_vals_bars <- unique(ac_visits_bars_cases$week_start)
# data frame for storing results
model_result_vals_bars <- data.frame(model_type = character(), coef_val = numeric(), p_val = numeric(), r_squared = numeric(), num_points = numeric(), week_start = character(), stringsAsFactors = FALSE)
for (j in 1:length(week_start_vals_bars)) {
curr_week_start <- week_start_vals_bars[j]
curr_ac_visits_cases <- ac_visits_bars_cases %>% filter(week_start == curr_week_start)
curr_ac_visits_cases_above0 <- curr_ac_visits_cases %>% filter(cases > 0)
curr_ac_visits_cases_above10 <- curr_ac_visits_cases %>% filter(cases >= 10)
# store results
if (nrow(curr_ac_visits_cases) > 1) {
visits_cases_change_model <- lm(change_cases_by_pop ~ visits_per_capita, curr_ac_visits_cases)
change_cases <- data.frame("change in cases", summary(visits_cases_change_model)$coefficients[2,1], summary(visits_cases_change_model)$coefficients[2,4], summary(visits_cases_change_model)$r.squared, nrow(curr_ac_visits_cases))
colnames(change_cases) <- c("model_type", "coef_val", "p_val", "r_squared", "num_points")
change_cases <- change_cases %>% mutate(week_start = curr_week_start)
model_result_vals_bars <- rbind(model_result_vals_bars, change_cases)
}
if (nrow(curr_ac_visits_cases_above0) > 1) {
visits_cases_change_model_above0 <- lm(change_cases_by_pop ~ visits_per_capita, curr_ac_visits_cases_above0)
visits_cases_change_model_log <- lm(change_log_cases_by_pop ~ visits_per_capita, curr_ac_visits_cases_above0)
change_cases_above0 <- data.frame("change in cases, starting above 0", summary(visits_cases_change_model_above0)$coefficients[2,1], summary(visits_cases_change_model_above0)$coefficients[2,4], summary(visits_cases_change_model_above0)$r.squared, nrow(curr_ac_visits_cases_above0))
colnames(change_cases_above0) <- c("model_type", "coef_val", "p_val", "r_squared", "num_points")
change_cases_above0 <- change_cases_above0 %>% mutate(week_start = curr_week_start)
model_result_vals_bars <- rbind(model_result_vals_bars, change_cases_above0)
change_cases_log <- data.frame("change in log of cases, starting above 0", summary(visits_cases_change_model_log)$coefficients[2,1], summary(visits_cases_change_model_log)$coefficients[2,4], summary(visits_cases_change_model_log)$r.squared, nrow(curr_ac_visits_cases_above0))
colnames(change_cases_log) <- c("model_type", "coef_val", "p_val", "r_squared", "num_points")
change_cases_log <- change_cases_log %>% mutate(week_start = curr_week_start)
model_result_vals_bars <- rbind(model_result_vals_bars, change_cases_log)
}
if (nrow(curr_ac_visits_cases_above10) > 1) {
visits_cases_change_model_above10 <- lm(change_cases_by_pop ~ visits_per_capita, curr_ac_visits_cases_above10)
visits_cases_change_model_above10_log <- lm(change_log_cases_by_pop ~ visits_per_capita, curr_ac_visits_cases_above10)
change_cases_above10 <- data.frame("change in cases, starting above 10", summary(visits_cases_change_model_above10)$coefficients[2,1], summary(visits_cases_change_model_above10)$coefficients[2,4], summary(visits_cases_change_model_above10)$r.squared, nrow(curr_ac_visits_cases_above10))
colnames(change_cases_above10) <- c("model_type", "coef_val", "p_val", "r_squared", "num_points")
change_cases_above10 <- change_cases_above10 %>% mutate(week_start = curr_week_start)
model_result_vals_bars <- rbind(model_result_vals_bars, change_cases_above10)
change_cases_log_10 <- data.frame("change in log of cases, starting above 10", summary(visits_cases_change_model_above10_log)$coefficients[2,1], summary(visits_cases_change_model_above10_log)$coefficients[2,4], summary(visits_cases_change_model_above10_log)$r.squared, nrow(curr_ac_visits_cases_above10))
colnames(change_cases_log_10) <- c("model_type", "coef_val", "p_val", "r_squared", "num_points")
change_cases_log_10 <- change_cases_log_10 %>% mutate(week_start = curr_week_start)
model_result_vals_bars <- rbind(model_result_vals_bars, change_cases_log_10)
}
}
# plot the r squared over time
model_result_vals_bars %>% filter(model_type == "change in cases") %>%
plot_ly() %>%
add_trace(x = ~week_start, y = ~r_squared, type = 'scatter', mode = 'markers', text = ~num_points) %>%
add_trace(x = ~week_start, y = ~r_squared, type = 'scatter', mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = "Start date of visits"), yaxis = list(title = "R squared for that model"), title = "R squared over time, 1 week time interval, 14 day lag, Alameda, bars visits")
model_result_vals_bars %>% filter(model_type == "change in cases, starting above 10") %>%
plot_ly() %>%
add_trace(x = ~week_start, y = ~r_squared, type = 'scatter', mode = 'markers', text = ~num_points) %>%
add_trace(x = ~week_start, y = ~r_squared, type = 'scatter', mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = "Start date of visits"), yaxis = list(title = "R squared for that model"), title = "R squared over time, 1 week time interval, 14 day lag, Alameda, bars visits")
# coefficient
model_result_vals_bars %>% filter(model_type == "change in cases") %>%
plot_ly() %>%
add_trace(x = ~week_start, y = ~coef_val, type = 'scatter', mode = 'markers', text = ~num_points) %>%
add_trace(x = ~week_start, y = ~coef_val, type = 'scatter', mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = "Start date of visits"), yaxis = list(title = "Coefficient on visits for that model"), title = "Visits coef over time, 1 week time interval, 14 day lag, Alameda, bars visits")
model_result_vals_bars %>% filter(model_type == "change in cases, starting above 10") %>%
plot_ly() %>%
add_trace(x = ~week_start, y = ~coef_val, type = 'scatter', mode = 'markers', text = ~num_points) %>%
add_trace(x = ~week_start, y = ~coef_val, type = 'scatter', mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = "Start date of visits"), yaxis = list(title = "Coefficient on visits for that model"), title = "Visits coef over time, 1 week time interval, 14 day lag, Alameda, bars visits")
# p value
model_result_vals_bars %>% filter(model_type == "change in cases") %>%
plot_ly() %>%
add_trace(x = ~week_start, y = ~p_val, type = 'scatter', mode = 'markers', text = ~num_points) %>%
add_trace(x = ~week_start, y = ~p_val, type = 'scatter', mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = "Start date of visits"), yaxis = list(title = "P value of coef for that model"), title = "P val over time, 1 week time interval, 14 day lag, Alameda, bars visits")
model_result_vals_bars %>% filter(model_type == "change in cases, starting above 10") %>%
plot_ly() %>%
add_trace(x = ~week_start, y = ~p_val, type = 'scatter', mode = 'markers', text = ~num_points) %>%
add_trace(x = ~week_start, y = ~p_val, type = 'scatter', mode = 'lines', showlegend = F) %>%
layout(xaxis = list(title = "Start date of visits"), yaxis = list(title = "P value of coef for that model"), title = "P val over time, 1 week time interval, 14 day lag, Alameda, bars visits")